{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 08 Marginal and conditional distributions"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%html\n",
""
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import plotly.graph_objects as go\n",
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import findspark\n",
"\n",
"findspark.init()\n",
"from pyspark.context import SparkContext\n",
"from pyspark.sql import functions as F\n",
"from pyspark.sql.session import SparkSession\n",
"\n",
"spark = SparkSession.builder.appName(\"statistics\").master(\"local\").getOrCreate()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[khanacademy](https://www.khanacademy.org/math/ap-statistics/analyzing-categorical-ap/distributions-two-way-tables/v/marginal-distribution-and-conditional-distribution?modal=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"dataset = {\n",
" \"#\": [\"80-100\", \"60-79\", \"40-59\", \"20-39\", \"0-19\"],\n",
" \"0-20\": [0, 0, 2, 10, 2],\n",
" \"21-40\": [4, 20, 4, 2, 0],\n",
" \"41-60\": [16, 30, 32, 8, 0],\n",
" \">60\": [20, 10, 32, 0, 8],\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0-20 | \n",
" 21-40 | \n",
" 41-60 | \n",
" >60 | \n",
"
\n",
" \n",
" # | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 80-100 | \n",
" 0 | \n",
" 4 | \n",
" 16 | \n",
" 20 | \n",
"
\n",
" \n",
" 60-79 | \n",
" 0 | \n",
" 20 | \n",
" 30 | \n",
" 10 | \n",
"
\n",
" \n",
" 40-59 | \n",
" 2 | \n",
" 4 | \n",
" 32 | \n",
" 32 | \n",
"
\n",
" \n",
" 20-39 | \n",
" 10 | \n",
" 2 | \n",
" 8 | \n",
" 0 | \n",
"
\n",
" \n",
" 0-19 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 8 | \n",
"
\n",
" \n",
" Total | \n",
" 14 | \n",
" 30 | \n",
" 86 | \n",
" 70 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0-20 21-40 41-60 >60\n",
"# \n",
"80-100 0 4 16 20\n",
"60-79 0 20 30 10\n",
"40-59 2 4 32 32\n",
"20-39 10 2 8 0\n",
"0-19 2 0 0 8\n",
"Total 14 30 86 70"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(dataset).set_index(\"#\")\n",
"df = df.append(df.sum().rename(\"Total\"))\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------+----+-----+-----+---+\n",
"| #|0-20|21-40|41-60|>60|\n",
"+------+----+-----+-----+---+\n",
"|80-100| 0| 4| 16| 20|\n",
"| 60-79| 0| 20| 30| 10|\n",
"| 40-59| 2| 4| 32| 32|\n",
"| 20-39| 10| 2| 8| 0|\n",
"| 0-19| 2| 0| 0| 8|\n",
"+------+----+-----+-----+---+\n",
"\n"
]
}
],
"source": [
"sdf = spark.createDataFrame(zip(*dataset.values()), schema=list(dataset.keys()))\n",
"sdf.registerTempTable(\"sdf_table\")\n",
"sdf.show()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0-20 | \n",
" 21-40 | \n",
" 41-60 | \n",
" >60 | \n",
"
\n",
" \n",
" \n",
" \n",
" 80-100 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 16.0 | \n",
" 20.0 | \n",
"
\n",
" \n",
" 60-79 | \n",
" 0.0 | \n",
" 20.0 | \n",
" 30.0 | \n",
" 10.0 | \n",
"
\n",
" \n",
" 40-59 | \n",
" 2.0 | \n",
" 4.0 | \n",
" 32.0 | \n",
" 32.0 | \n",
"
\n",
" \n",
" 20-39 | \n",
" 10.0 | \n",
" 2.0 | \n",
" 8.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 0-19 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 8.0 | \n",
"
\n",
" \n",
" Total | \n",
" 14.0 | \n",
" 30.0 | \n",
" 86.0 | \n",
" 70.0 | \n",
"
\n",
" \n",
" Percentage | \n",
" 7.0 | \n",
" 15.0 | \n",
" 43.0 | \n",
" 35.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0-20 21-40 41-60 >60\n",
"80-100 0.0 4.0 16.0 20.0\n",
"60-79 0.0 20.0 30.0 10.0\n",
"40-59 2.0 4.0 32.0 32.0\n",
"20-39 10.0 2.0 8.0 0.0\n",
"0-19 2.0 0.0 0.0 8.0\n",
"Total 14.0 30.0 86.0 70.0\n",
"Percentage 7.0 15.0 43.0 35.0"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df.append(\n",
" pd.DataFrame(\n",
" [df.iloc[-1] / df.iloc[-1].sum() * 100],\n",
" index=[\"Percentage\"],\n",
" columns=df.columns,\n",
" )\n",
")\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0-20 | \n",
" 21-40 | \n",
" 41-60 | \n",
" >60 | \n",
" Total | \n",
" Percentage | \n",
"
\n",
" \n",
" \n",
" \n",
" 80-100 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 16.0 | \n",
" 20.0 | \n",
" 40.0 | \n",
" 8.0 | \n",
"
\n",
" \n",
" 60-79 | \n",
" 0.0 | \n",
" 20.0 | \n",
" 30.0 | \n",
" 10.0 | \n",
" 60.0 | \n",
" 12.0 | \n",
"
\n",
" \n",
" 40-59 | \n",
" 2.0 | \n",
" 4.0 | \n",
" 32.0 | \n",
" 32.0 | \n",
" 70.0 | \n",
" 14.0 | \n",
"
\n",
" \n",
" 20-39 | \n",
" 10.0 | \n",
" 2.0 | \n",
" 8.0 | \n",
" 0.0 | \n",
" 20.0 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 0-19 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 8.0 | \n",
" 10.0 | \n",
" 2.0 | \n",
"
\n",
" \n",
" Total | \n",
" 14.0 | \n",
" 30.0 | \n",
" 86.0 | \n",
" 70.0 | \n",
" 200.0 | \n",
" NaN | \n",
"
\n",
" \n",
" Percentage | \n",
" 7.0 | \n",
" 15.0 | \n",
" 43.0 | \n",
" 35.0 | \n",
" 100.0 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0-20 21-40 41-60 >60 Total Percentage\n",
"80-100 0.0 4.0 16.0 20.0 40.0 8.0\n",
"60-79 0.0 20.0 30.0 10.0 60.0 12.0\n",
"40-59 2.0 4.0 32.0 32.0 70.0 14.0\n",
"20-39 10.0 2.0 8.0 0.0 20.0 4.0\n",
"0-19 2.0 0.0 0.0 8.0 10.0 2.0\n",
"Total 14.0 30.0 86.0 70.0 200.0 NaN\n",
"Percentage 7.0 15.0 43.0 35.0 100.0 NaN"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"Total\"] = df.sum(axis=1)\n",
"df[\"Percentage\"] = df[\"Total\"] / df[\"Total\"].sum() * 100\n",
"df.loc[\"Total\"][\"Percentage\"] = np.nan\n",
"df.loc[\"Percentage\"][\"Percentage\"] = np.nan\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------+----+-----+-----+---+-----+----------+\n",
"| #|0-20|21-40|41-60|>60|Total|Percentage|\n",
"+------+----+-----+-----+---+-----+----------+\n",
"|80-100| 0| 4| 16| 20| 40| 20.0|\n",
"| 60-79| 0| 20| 30| 10| 60| 30.0|\n",
"| 40-59| 2| 4| 32| 32| 70| 35.0|\n",
"| 20-39| 10| 2| 8| 0| 20| 10.0|\n",
"| 0-19| 2| 0| 0| 8| 10| 5.0|\n",
"+------+----+-----+-----+---+-----+----------+\n",
"\n"
]
}
],
"source": [
"sdf = sdf.withColumn(\n",
" \"Total\", F.col(\"0-20\") + F.col(\"21-40\") + F.col(\"41-60\") + F.col(\">60\")\n",
")\n",
"sdf = sdf.withColumn(\n",
" \"Percentage\", F.col(\"Total\") / sdf.select(F.sum(\"Total\")).collect()[0][0] * 100\n",
")\n",
"sdf.show()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0-20 | \n",
" 21-40 | \n",
" 41-60 | \n",
" >60 | \n",
" Total | \n",
" Percentage | \n",
"
\n",
" \n",
" \n",
" \n",
" Total | \n",
" 14.0 | \n",
" 30.0 | \n",
" 86.0 | \n",
" 70.0 | \n",
" 200.0 | \n",
" NaN | \n",
"
\n",
" \n",
" Percentage | \n",
" 7.0 | \n",
" 15.0 | \n",
" 43.0 | \n",
" 35.0 | \n",
" 100.0 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0-20 21-40 41-60 >60 Total Percentage\n",
"Total 14.0 30.0 86.0 70.0 200.0 NaN\n",
"Percentage 7.0 15.0 43.0 35.0 100.0 NaN"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"marginal_dist_v_df = df.iloc[-2:]\n",
"marginal_dist_v_df"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Total | \n",
" Percentage | \n",
"
\n",
" \n",
" \n",
" \n",
" 80-100 | \n",
" 40.0 | \n",
" 8.0 | \n",
"
\n",
" \n",
" 60-79 | \n",
" 60.0 | \n",
" 12.0 | \n",
"
\n",
" \n",
" 40-59 | \n",
" 70.0 | \n",
" 14.0 | \n",
"
\n",
" \n",
" 20-39 | \n",
" 20.0 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 0-19 | \n",
" 10.0 | \n",
" 2.0 | \n",
"
\n",
" \n",
" Total | \n",
" 200.0 | \n",
" NaN | \n",
"
\n",
" \n",
" Percentage | \n",
" 100.0 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Total Percentage\n",
"80-100 40.0 8.0\n",
"60-79 60.0 12.0\n",
"40-59 70.0 14.0\n",
"20-39 20.0 4.0\n",
"0-19 10.0 2.0\n",
"Total 200.0 NaN\n",
"Percentage 100.0 NaN"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"marginal_dist_h_df = df.loc[:, \"Total\":]\n",
"marginal_dist_h_df"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------+-----+----------+\n",
"| #|Total|Percentage|\n",
"+------+-----+----------+\n",
"|80-100| 40| 20.0|\n",
"| 60-79| 60| 30.0|\n",
"| 40-59| 70| 35.0|\n",
"| 20-39| 20| 10.0|\n",
"| 0-19| 10| 5.0|\n",
"+------+-----+----------+\n",
"\n"
]
}
],
"source": [
"sdf.select(\"#\", \"Total\", \"Percentage\").show()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 41-60 | \n",
" Percentage | \n",
"
\n",
" \n",
" \n",
" \n",
" 80-100 | \n",
" 16.0 | \n",
" 18.604651 | \n",
"
\n",
" \n",
" 60-79 | \n",
" 30.0 | \n",
" 34.883721 | \n",
"
\n",
" \n",
" 40-59 | \n",
" 32.0 | \n",
" 37.209302 | \n",
"
\n",
" \n",
" 20-39 | \n",
" 8.0 | \n",
" 9.302326 | \n",
"
\n",
" \n",
" 0-19 | \n",
" 0.0 | \n",
" 0.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 41-60 Percentage\n",
"80-100 16.0 18.604651\n",
"60-79 30.0 34.883721\n",
"40-59 32.0 37.209302\n",
"20-39 8.0 9.302326\n",
"0-19 0.0 0.000000"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"conditional_dist_df = pd.DataFrame(df.loc[:, \"41-60\"])\n",
"conditional_dist_df = conditional_dist_df.iloc[:-2]\n",
"conditional_dist_df[\"Percentage\"] = (\n",
" conditional_dist_df[\"41-60\"] / conditional_dist_df[\"41-60\"].sum() * 100\n",
")\n",
"conditional_dist_df"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------+-----+------------------+\n",
"| #|41-60| Percentage|\n",
"+------+-----+------------------+\n",
"|80-100| 16| 18.6046511627907|\n",
"| 60-79| 30|34.883720930232556|\n",
"| 40-59| 32| 37.2093023255814|\n",
"| 20-39| 8| 9.30232558139535|\n",
"| 0-19| 0| 0.0|\n",
"+------+-----+------------------+\n",
"\n"
]
}
],
"source": [
"sdf.select(\"#\", \"41-60\").withColumn(\n",
" \"Percentage\", F.col(\"41-60\") / sdf.select(F.sum(\"41-60\")).collect()[0][0] * 100\n",
").show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}